* Program to create analysis for SDC thresholds paper * Created 2nd August 2019 by Felix Ritchie * * General principle: * get or create some data * decide on X, Y and Z catoegirsations * map counts of X vs Y, X vs Z subject to a threshold * calculate the table X vs (|Y-Z|) * calc various stats of 1s and 2s in that table * * Last modified: * dd.mm.yy vxx Who * quietly { * set up capture log close set more off clear all local vers_no = "01" set seed 20190802 local working_folder = "C:\Users\fj-ritchie\OneDrive - UWE Bristol (Staff)\research\papers\10 is the safest number" foreach case in 1 2 3 { foreach type in a s { local notfirst`case'`type' = 0 } } * Macros to run the code - n_obs and n_its refer to the simulated data only global thresholds = "3 4 5 6 7 8 9 10 11 12 13 14 15 20 25 30" local n_obs = 5000 local n_its = 1000 local use_uniform = 1 local analyses_to_run = "simulated LFS LFSlowpay charity" * local analyses_to_run = "LFS LFSlowpay charity" * local analyses_to_run = "simulated" local cat_types = "u uexp" local yvars = "y z w q" local n_age_cats = 10 local n_ageexp_cats = 5 local urban_pc = "70 80 90 95" local homeowner_pc "70 80 90 95" local degree_pc = "15 10 5" local white_pc = "90 95 99" label define ylabel 0 "rural" 1 "urban" label define zlabel 0 "renting" 1 "owner" label define qlabel 0 "no degree" 1 "degree" label define wlabel 0 "nonwhite" 1 "white" * Show useful information local logfile = "`working_folder'\\data_v`vers_no'_obs`n_obs'_its`n_its'_uniform`use_uniform'_`analyses_to_run'.smcl" log using "`logfile'", replace set linesize 255 local uniform = "e" if `use_uniform' { local uniform = "u" } local res_name = "`working_folder'\results_`uniform'`n_obs'_case" noisily display "*****************************************************" noisily display "Run parameters:" noisily display "Version no : `vers_no'" noisily display "Number of observations : `n_obs'" noisily display "Number of iterations : `n_its'" noisily display "Uniform age dist : `use_uniform' (`uniform')" noisily display "Thresholds : `thresholds'" noisily display "Simulated data values:" noisily display "Urban % : `urban_pc'" noisily display "Homeowner % : `homeowner_pc'" noisily display "Degree % : `degree_pc'" noisily display "White % : `white_pc'" noisily display "Files:" noisily display "Log file : `logfile'" noisily display "Results file : `res_name'N_act/sim" noisily display "*****************************************************" * Now loop for each of the simulated data, teh LFS and the charity data foreach source in `analyses_to_run' { * set up the variable types local y_pc = 0 local z_pc = 0 local q_pc = 0 local w_pc = 0 local type = "a" if "`source'" == "simulated" { noisily display "Conceptual analysis using simulated data" noisily display "========================================" local obs_and_its = `n_obs'*`n_its' set obs `obs_and_its' gen it_no = mod(_n,`n_its') gen int ageu = runiform()*`n_age_cats' replace ageu = `n_age_cats' - 1 if ageu>=`n_age_cats' gen int ageexp = exp(ageu/5)-1 replace ageexp = `n_ageexp_cats'-1 if ageexp>=`n_ageexp_cats' if `use_uniform' { gen age = ageu local divisor = `n_age_cats'*2 } else { gen age = ageexp local divisor = `n_ageexp_cats'*2 } ren age x local y_pc = "`urban_pc'" local z_pc = "`homeowner_pc'" local q_pc = "`degree_pc'" local w_pc = "`white_pc'" foreach vv in `yvars' { gen base`vv' = (runiform()*100) local nn = 0 foreach vpc in ``vv'_pc' { local nn = `nn'+1 gen byte `vv'`nn' = base`vv' < `vpc' label values `vv'`nn' `vv'label } } local type = "s" } else if "`source'"=="LFS" { noisily display "LFS data" noisily display "========================================" use "C:\Users\fj-ritchie\OneDrive - UWE Bristol (Staff)\consulting\ONS\output checking training\materials\SDC examples\lfs2002.dta", clear keep if age>=50 ren sex female gen england = region<9 gen degree = hiquald==1 gen white = eth==1 ren ages x ren female y1 ren england z1 ren degree q1 ren white w1 gen it_no = 1 local divisor = 4*2 } else if "`source'"=="LFSlowpay" { noisily display "LFS data (low paid only)" noisily display "========================================" use "C:\Users\fj-ritchie\OneDrive - UWE Bristol (Staff)\consulting\ONS\output checking training\materials\SDC examples\lfs2002.dta", clear keep if age>=50 keep if hourpay<10 & hourpay>0 ren sex female gen england = region<9 gen degree = hiquald==1 gen white = eth==1 ren ages x ren female y1 ren england z1 ren degree q1 ren white w1 gen it_no = 1 local divisor = 4*2 } else { noisily display "Charity data" noisily display "========================================" use "C:\Users\fj-ritchie\felix working files\november 2018\charity funding\charities 150 v12.dta", clear drop if inc_total==. drop if year==2015 sort charity by charity: egen m_inc = mean(inc_total) gen big = m_inc>1000000 gen secure = sh_assets_total_costs>12 gen surplus = balance>0 ren year x ren big y1 ren survivor z1 ren secure q1 ren surplus w1 gen it_no = 1 local divisor = 5*2 } * and now the calculations * note that 'the 'urban' split applies to all, the others are for cases 1-3 gen n_obs = 1 local yn = 0 foreach ypc in `y_pc' { local yn = `yn'+1 ren y`yn' y noisily display "y=`ypc' " noisily display "***************" ******************************* repeat for each 'urban' option * Ii.1 first differenced table (Case 1) noisily display "Calculations for Case 1..." local zn = 0 foreach zpc in `z_pc' { local zn = `zn'+1 ren z`zn' z noisily display "Case 1: z=`zpc' Tables 1, 2 and 2a" noisily tab x y noisily tab x y if z noisily tab x y if z==0 preserve collapse (sum) n_obs , by(it_no x y z) sort it_no y x z by it_no y x: egen t1 = total(n_obs) by it_no y x: egen t2h = total(n_obs) if z by it_no y x: egen t2ar = total(n_obs) if z==0 by it_no y x: gen t2 = t2h[_N] by it_no y x: gen t2a = t2a[1] by it_no y x: drop if _n>1 by it_no y: egen t1median = median(t1) by it_no: gen t1umedian = t1median[_N] by it_no: gen t1rmedian = t1median[1] drop z n_obs t2h t2ar t1median foreach threshold in $thresholds { gen t1clean = t1 if t1>=`threshold' gen t2clean = t2 if t2>=`threshold' gen t2aclean = t2a if (t1clean~=. ) & (t2clean~=.) gen bad = (t2aclean ==1) | (t2aclean==2) gen ok1 = t1clean ~= . gen ok2 = t2clean ~= . sort it_no y foreach pp in ok1 ok2 bad { by it_no: egen `pp'_prop = total(`pp') * divide by appopriate divisor, not _N as there might be some cells with no values, but 0 is a potential value by it_no: replace `pp'_prop = `pp'_prop/`divisor' } foreach vv in bad bad_prop ok1_prop ok2_prop { ren `vv' `vv'`threshold' } drop t1clean t2clean t2aclean ok1 ok2 } drop x y t1 t2 t2a by it_no: keep if _n==1 reshape long bad bad_prop ok1_prop ok2_prop, i(it_no t1umedian t1rmedian) j(threshold) noisily display "Case 1 results:" noisily tab bad_prop threshold noisily tab ok1_prop threshold noisily tab ok2_prop threshold gen problems = 0 replace problems = 1 if bad_prop>0 & bad_prop<1 replace problems = 2 if bad_prop==1 label define probs 0 "none" 1 "some" 2 "all" label values problems probs noisily tab threshold problems if "`source'"=="simulated" { local results = "`res_name'1_sim.dta" gen ypc = `ypc' gen zpc = `zpc' } else { local results = "`res_name'1_act.dta" gen src = "`source'" } if `notfirst1`type'' { append using "`results'" } save "`results'" , replace local notfirst1`type' = 1 restore ren z z`zn' } * II.2 Unsuppression through marginal totals (Case 2) noisily display "Calculations for Case 2..." local qn = 0 foreach qpc in `q_pc' { local qn = `qn'+1 ren q`qn' q noisily display "Case q=`qpc' Tables 1 and 3" noisily tab x y noisily tab x q preserve collapse (sum) n_obs , by(it_no x y q) sort it_no x y by it_no x y: egen t1 = total(n_obs) by it_no x: gen t1u = t1[_N] by it_no x: gen t1r = t1[1] sort it_no x q by it_no x q: egen t3 = total(n_obs) by it_no x: gen t3d = t3[_N] by it_no x: gen t3n = t3[1] by it_no x: drop if _n>1 foreach row_item in t1u t1r t3d t3n { by it_no: egen `row_item'median = median(`row_item') } foreach threshold in $thresholds { foreach row_item in t1u t1r t3d t3n { gen `row_item'clean = `row_item' if `row_item'>=`threshold' * zero treated as below threshold, so this next bit is fine mvencode `row_item'clean , mv(0) } gen ok1 = 0.5*( ((t1u==0) | (t1u>`threshold')) + ((t1r==0) | (t1r>`threshold')) ) gen ok3 = 0.5*( ((t3d==0) | (t3d>`threshold')) + ((t3n==0) | (t3n>`threshold')) ) gen t1gap = ((t1uclean==0) & (t1rclean~=0)) | ((t1uclean~=0) & (t1rclean==0)) gen t3gap = ((t3dclean==0) & (t3nclean~=0)) | ((t3dclean~=0) & (t3nclean==0)) gen bad = (t1gap | t3gap) & (t1gap~=t3gap) foreach pp in ok1 ok3 bad { by it_no: egen `pp'_prop = total(`pp') * divide by appopriate divisor, not _N as there might be some cells with no values, but 0 is a potential value * only one row in each table, so halve denominator by it_no: replace `pp'_prop = 2*`pp'_prop/`divisor' } foreach vv in bad_prop ok1_prop ok3_prop { ren `vv' `vv'`threshold' } drop bad t??clean t?gap ok1 ok3 } drop x q t1 t3 t1? t3? by it_no: keep if _n==1 reshape long bad_prop ok1_prop ok3_prop, i(it_no t??median) j(threshold) noisily tab bad_prop threshold noisily tab ok3_prop threshold gen problems = 0 replace problems = 1 if bad_prop>0 & bad_prop<1 replace problems = 2 if bad_prop==1 label define probs 0 "none" 1 "some" 2 "all" label values problems probs noisily tab threshold problems if "`source'"=="simulated" { local results = "`res_name'2_sim.dta" gen ypc = `ypc' gen qpc = `qpc' } else { local results = "`res_name'2_act.dta" gen src = "`source'" } if `notfirst2`type'' { append using "`results'" } save "`results'" , replace local notfirst2`type' = 1 restore ren q q`qn' } * II.3 Disclosure through negation (Case 3) noisily display "Calculations for Case 3..." local wn = 0 foreach wpc in `w_pc' { local wn = `wn'+1 ren w`wn' w noisily display "Case w=`wpc' Tables 4 and 4a" noisily table x y, contents(freq mean w) noisily tab x y if w==0 preserve collapse (sum) n_obs (mean) w, by(it_no x y) sort it_no x y ren n_obs t4 gen t4a = round((1-w)*t4) drop y w foreach vv in t4 t4a { by it_no: egen `vv'median = median(`vv') } foreach threshold in $thresholds { gen bad = 0 replace bad = 1 if (t4>=`threshold') & ((t4a==1)|(t4a==2)) gen ok4 = (t4>=`threshold')| (t4==0) foreach pp in ok4 bad { by it_no: egen `pp'_prop = total(`pp') * divide by appopriate divisor, not _N as there might be some cells with no values, but 0 is a potential value by it_no: replace `pp'_prop = `pp'_prop/`divisor' } foreach vv in bad_prop ok4_prop { ren `vv' `vv'`threshold' } drop bad ok4 } keep t4median t4amedian bad_prop* ok4_prop* it_no by it_no: keep if _n==1 reshape long bad_prop ok4_prop, i(it_no t4median t4amedian ) j(threshold) noisily tab bad_prop threshold noisily tab ok4_prop threshold gen problems = 0 replace problems = 1 if bad_prop>0 & bad_prop<1 replace problems = 2 if bad_prop==1 label define probs 0 "none" 1 "some" 2 "all" label values problems probs noisily tab threshold problems if "`source'"=="simulated" { local results = "`res_name'3_sim.dta" gen ypc = `ypc' gen wpc = `wpc' } else { local results = "`res_name'3_act.dta" gen src = "`source'" } if `notfirst3`type'' { append using "`results'" } save "`results'" , replace local notfirst3`type' = 1 restore ren w w`wn' } ******************************* ren y y`yn' } } log close }